%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Fix the NumPy seed so random draws (e.g. the random test sentence picked at
# the end of the notebook) are reproducible.
np.random.seed(0)
plt.style.use("ggplot")
import tensorflow as tf
# Report TF version and whether TensorFlow can see a GPU (empty list = CPU only).
print('Tensorflow version:', tf.__version__)
print('GPU detected:', tf.config.list_physical_devices('GPU'))
Tensorflow version: 2.4.0 GPU detected: []
!nvidia-smi
Mon Jun 7 01:04:51 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27 Driver Version: 460.32.03 CUDA Version: 11.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |
| N/A 56C P0 29W / 70W | 442MiB / 15109MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
+-----------------------------------------------------------------------------+
Tagged entities:
| Tag | Explanation |
|---|---|
| geo | Geographical Entity |
| org | Organization |
| per | Person |
| gpe | Geopolitical Entity |
| tim | Time indicator |
| art | Artifact |
| eve | Event |
| nat | Natural Phenomenon |
# Mount Google Drive so files stored on Drive are reachable from the Colab filesystem.
from google.colab import drive
drive.mount("/content/drive")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Load the token-level NER dataset (latin1 avoids decode errors on this file).
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
# Forward-fill the sparse "Sentence #" column so every token row carries its
# sentence id. .ffill() replaces fillna(method="ffill"), which is deprecated
# and removed in pandas 2.x.
data = data.ffill()
data.head(20)
| Sentence # | Word | POS | Tag | |
|---|---|---|---|---|
| 0 | Sentence: 1 | Thousands | NNS | O |
| 1 | Sentence: 1 | of | IN | O |
| 2 | Sentence: 1 | demonstrators | NNS | O |
| 3 | Sentence: 1 | have | VBP | O |
| 4 | Sentence: 1 | marched | VBN | O |
| 5 | Sentence: 1 | through | IN | O |
| 6 | Sentence: 1 | London | NNP | B-geo |
| 7 | Sentence: 1 | to | TO | O |
| 8 | Sentence: 1 | protest | VB | O |
| 9 | Sentence: 1 | the | DT | O |
| 10 | Sentence: 1 | war | NN | O |
| 11 | Sentence: 1 | in | IN | O |
| 12 | Sentence: 1 | Iraq | NNP | B-geo |
| 13 | Sentence: 1 | and | CC | O |
| 14 | Sentence: 1 | demand | VB | O |
| 15 | Sentence: 1 | the | DT | O |
| 16 | Sentence: 1 | withdrawal | NN | O |
| 17 | Sentence: 1 | of | IN | O |
| 18 | Sentence: 1 | British | JJ | B-gpe |
| 19 | Sentence: 1 | troops | NNS | O |
# Vocabulary and label-space sizes; these drive the embedding input_dim and
# the number of output classes below.
print("Unique words in corpus:", data['Word'].nunique())
print("Unique tags in corpus:", data['Tag'].nunique())
Unique words in corpus: 35178 Unique tags in corpus: 17
# Build the vocabulary and tag inventory.
# sorted() makes the word->index and tag->index mappings deterministic across
# runs; a bare list(set(...)) changes order with Python's hash randomization,
# so indices (and therefore any saved "model_weights.h5") would not line up
# on a fresh kernel.
words = sorted(set(data["Word"].values))
words.append("ENDPAD")  # padding token, appended last
num_words = len(words)
tags = sorted(set(data["Tag"].values))
num_tags = len(tags)
# Distribution of entity tags, excluding the dominant "O" (outside) class.
# NOTE(review): str.contains("O") happens to drop exactly the "O" tag because
# no other tag in this dataset contains an uppercase O; data.Tag != "O" would
# state the intent more directly.
import plotly.express as px
fig = px.histogram(data[~data.Tag.str.contains("O")], x="Tag",color="Tag")
fig.show()
import tqdm
def sentence_integrate(data):
    """Collapse the token-level frame into one list of (word, POS, tag) tuples per sentence.

    Sentences are keyed by the 'Sentence #' column; groupby yields them in
    sorted key order.
    """
    def to_triples(group):
        # Stitch the three columns back together row by row.
        token_words = group["Word"].values.tolist()
        pos_labels = group["POS"].values.tolist()
        ner_labels = group["Tag"].values.tolist()
        return list(zip(token_words, pos_labels, ner_labels))
    return data.groupby('Sentence #').apply(to_triples).tolist()
# One list of (word, POS, tag) tuples per sentence.
sentences=sentence_integrate(data)
# Sentence-length distribution; informs the max_len=50 padding choice below.
# NOTE(review): plotly.express was already imported in an earlier cell.
import plotly.express as px
fig = px.histogram(pd.DataFrame([len(s) for s in sentences],columns=['length']),x="length",marginal='box')
fig.show()
# Inspect the first integrated sentence.
sentences[0]
[('Thousands', 'NNS', 'O'),
('of', 'IN', 'O'),
('demonstrators', 'NNS', 'O'),
('have', 'VBP', 'O'),
('marched', 'VBN', 'O'),
('through', 'IN', 'O'),
('London', 'NNP', 'B-geo'),
('to', 'TO', 'O'),
('protest', 'VB', 'O'),
('the', 'DT', 'O'),
('war', 'NN', 'O'),
('in', 'IN', 'O'),
('Iraq', 'NNP', 'B-geo'),
('and', 'CC', 'O'),
('demand', 'VB', 'O'),
('the', 'DT', 'O'),
('withdrawal', 'NN', 'O'),
('of', 'IN', 'O'),
('British', 'JJ', 'B-gpe'),
('troops', 'NNS', 'O'),
('from', 'IN', 'O'),
('that', 'DT', 'O'),
('country', 'NN', 'O'),
('.', '.', 'O')]
# Map each word to an integer starting at 1 (index 0 is left unused, e.g. for
# masking); ENDPAD, appended last, therefore gets index num_words.
# NOTE(review): word indices run 1..num_words — make sure the downstream
# Embedding layer's input_dim covers the top index.
word2idx = {w: i + 1 for i, w in enumerate(words)}
# Tags are indexed 0..num_tags-1.
tag2idx = {t: i for i, t in enumerate(tags)}
tag2idx
{'B-art': 2,
'B-eve': 1,
'B-geo': 9,
'B-gpe': 12,
'B-nat': 4,
'B-org': 8,
'B-per': 14,
'B-tim': 16,
'I-art': 6,
'I-eve': 5,
'I-geo': 11,
'I-gpe': 3,
'I-nat': 13,
'I-org': 0,
'I-per': 10,
'I-tim': 7,
'O': 15}
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Truncate/pad every sentence to a fixed length of 50 tokens.
max_len = 50
# Encode words by index; pad short sentences at the end ("post").
# NOTE(review): value=num_words-1 is NOT ENDPAD's word2idx index (that is
# num_words); it is the index of an ordinary vocabulary word, so decoded
# padding positions print as a real word — confirm the intended pad index.
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)
# Encode tags; padding positions are labelled "O" (outside any entity).
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow import keras

# Token-level NER tagger: Embedding -> BiLSTM -> per-timestep softmax over tags.
model = keras.Sequential()
model.add(InputLayer((max_len)))
# input_dim=num_words+1: word2idx assigns indices 1..num_words (ENDPAD is
# num_words), so the embedding table needs num_words+1 rows; the original
# input_dim=num_words left the top index out of range.
model.add(Embedding(input_dim=num_words + 1, output_dim=max_len, input_length=max_len))
model.add(SpatialDropout1D(0.1))
# recurrent_dropout > 0 disables the cuDNN fast path (hence the TF warning);
# kept as configured.
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
# Output head, missing in the original: project each timestep onto the tag
# space so the model emits (None, max_len, num_tags) to match the sparse
# integer labels y of shape (None, max_len). TimeDistributed and Dense were
# imported above but never used.
model.add(TimeDistributed(Dense(num_tags, activation="softmax")))
model.summary()
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_3 (Embedding) (None, 50, 50) 1758950 _________________________________________________________________ spatial_dropout1d_2 (Spatial (None, 50, 50) 0 _________________________________________________________________ bidirectional_2 (Bidirection (None, 50, 200) 120800 ================================================================= Total params: 1,879,750 Trainable params: 1,879,750 Non-trainable params: 0 _________________________________________________________________
# Render the architecture diagram to model.png.
tf.keras.utils.plot_model(
model, to_file='model.png', show_shapes=True, show_dtype=False,
show_layer_names=True, rankdir='LR', expand_nested=True, dpi=300,
)
# Sparse loss: y holds integer tag ids, not one-hot vectors.
model.compile(optimizer="adam",
loss="sparse_categorical_crossentropy",
metrics=["accuracy"])
!pip install livelossplot
Requirement already satisfied: livelossplot in /usr/local/lib/python3.7/dist-packages (0.5.4) Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from livelossplot) (3.2.2) Requirement already satisfied: ipython in /usr/local/lib/python3.7/dist-packages (from livelossplot) (5.5.0) Requirement already satisfied: bokeh in /usr/local/lib/python3.7/dist-packages (from livelossplot) (2.3.2) Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from matplotlib->livelossplot) (1.19.5) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->livelossplot) (1.3.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->livelossplot) (2.4.7) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->livelossplot) (2.8.1) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->livelossplot) (0.10.0) Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (0.8.1) Requirement already satisfied: decorator in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (4.4.2) Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (1.0.18) Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (5.0.5) Requirement already satisfied: pexpect; sys_platform != "win32" in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (4.8.0) Requirement already satisfied: pickleshare in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (0.7.5) Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (2.6.1) 
Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.7/dist-packages (from ipython->livelossplot) (57.0.0) Requirement already satisfied: pillow>=7.1.0 in /usr/local/lib/python3.7/dist-packages (from bokeh->livelossplot) (7.1.2) Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.7/dist-packages (from bokeh->livelossplot) (3.13) Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from bokeh->livelossplot) (3.7.4.3) Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.7/dist-packages (from bokeh->livelossplot) (5.1.1) Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.7/dist-packages (from bokeh->livelossplot) (2.11.3) Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.7/dist-packages (from bokeh->livelossplot) (20.9) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->livelossplot) (1.15.0) Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython->livelossplot) (0.2.5) Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.7/dist-packages (from traitlets>=4.2->ipython->livelossplot) (0.2.0) Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.7/dist-packages (from pexpect; sys_platform != "win32"->ipython->livelossplot) (0.7.0) Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.9->bokeh->livelossplot) (2.0.1)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot.tf_keras import PlotLossesCallback
%%time
# TensorBoard event files are written under log/.
logdir="log/"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
# Persist only the weights of the epoch with the lowest validation loss.
chkpt = ModelCheckpoint("model_weights.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')
# Stop after a single epoch without val_accuracy improvement.
# NOTE(review): early stopping watches val_accuracy while the checkpoint
# watches val_loss, and restore_best_weights=False — the in-memory model after
# fit() may differ from the checkpointed best; confirm this is intended.
early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=1, verbose=0, mode='max', baseline=None, restore_best_weights=False)
callbacks = [PlotLossesCallback(), chkpt, early_stopping,tensorboard_callback]
# NOTE(review): the test split doubles as the validation set here, so early
# stopping/model selection peeks at the data later used for final evaluation.
history = model.fit(
x=x_train,
y=y_train,
validation_data=(x_test,y_test),
batch_size=32,
epochs=3,
callbacks=callbacks,
verbose=1
)
accuracy training (min: 0.938, max: 0.979, cur: 0.979) validation (min: 0.970, max: 0.979, cur: 0.979) Loss training (min: 0.083, max: 0.323, cur: 0.083) validation (min: 0.096, max: 0.137, cur: 0.096) Epoch 00003: val_loss improved from 0.10783 to 0.09561, saving model to model_weights.h5 CPU times: user 29min 48s, sys: 3min 28s, total: 33min 16s Wall time: 19min 52s
# Load the TensorBoard notebook extension and display the curves logged under log/.
%load_ext tensorboard
%tensorboard --logdir log
# Final evaluation on the held-out split (the same split used as validation
# during training).
print("Evaluate on test data")
test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=128)
print("test loss: {} ".format(test_loss))
print("test accuracy: {} ".format(test_accuracy))
Evaluate on test data 75/75 [==============================] - 2s 29ms/step - loss: 0.0956 - accuracy: 0.9789 test loss: 0.09561429172754288 test accuracy: 0.9788948893547058
# Spot-check one random test sentence: predicted vs. gold tag per token.
i = np.random.randint(0, x_test.shape[0])
print("This is sentence:",i)
# Model scores for a batch of one sentence; argmax over the last axis gives
# the per-token tag ids.
p = model.predict(np.array([x_test[i]]))
p = np.argmax(p, axis=-1)
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
# words[w-1] inverts word2idx's +1 offset to recover the token string.
# NOTE(review): padded positions hold index num_words-1, which decodes to an
# arbitrary real vocabulary word (the repeated filler word at the end of the
# printout) rather than ENDPAD — confirm the intended pad index.
for w, true, pred in zip(x_test[i], y_test[i], p[0]):
    print("{:15}{}\t{}".format(words[w-1], tags[true], tags[pred]))
This is sentence: 2745 Word True Pred ------------------------------ Shaima B-per B-org Rezayee I-per I-per had O O presented O O a O O music O O program O O on O O the O O privately-run O B-geo television O O channel O O Tolo B-org B-geo TV I-org O , O O and O O was O O shot O O in O O the O O head O O in O O the B-geo O Kabul I-geo B-geo neighborhood O O of O O Char B-org O Qala I-org I-geo on O O Wednesday B-tim B-tim . O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O ties O O